#!/bin/sh -e
# Converts HTML to markdown.
# The input may be a local file, standard input, or a URL.  A URL is fetched
# with whichever supported web grabber is available, and the HTML is
# normalized with tidy before it is converted.

# NOTE(review): presumably the list of external programs that the shared
# startup code (see the "### common.sh" marker) checks for -- confirm.
REQUIRED=tidy

### common.sh

# grab_url_with URL [GRABBER_COMMAND]
#
# Fetch URL with a command-line web grabber and write the document to stdout.
#
# Arguments:
#   $1    URL to fetch (required; the shell aborts with an error if missing).
#   $2..  Optional grabber command line, e.g. 'wget --user-agent="My UA"'.
#         The words are joined with spaces and parsed once with eval, so
#         shell quoting inside the string is honoured.  If empty, the first
#         of wget, lynx, w3m, curl, links, w3c accepted by pathfind is used.
# Globals: reads THIS; sets url, cmdline, prog, prog_opts, base_opts and p
#          (no 'local', to stay within POSIX sh).
# Returns: 1 when no usable grabber is found; otherwise the exit status of
#          the grabber.
# Uses pathfind, err and errn, which are defined outside this function.
grab_url_with () {
    url="${1:?internal error: grab_url_with: url required}"

    shift
    cmdline="$*"

    prog=
    if [ -n "$cmdline" ]; then
        # NOTE: this evaluates a user-supplied string (the -g option) so that
        # quoted options work; it must never be fed untrusted input.
        eval "set -- $cmdline"
        if [ $# -gt 0 ]; then
            prog=$1
            shift
        fi
    else
        # Drop the (empty) grabber argument so "$@" holds no options.
        set --
    fi
    # From here on "$@" holds the user's grabber options, one per word.
    # prog_opts is only a flattened copy for the log message below.
    prog_opts="$*"

    if [ -z "$prog" ]; then
        # Locate a sensible web grabber (note the order).
        for p in wget lynx w3m curl links w3c; do
            if pathfind "$p"; then
                prog=$p
                break
            fi
        done

        [ -n "$prog" ] || {
            errn "$THIS:  Couldn't find a program to fetch the file from URL "
            err "(e.g. wget, w3m, lynx, w3c, or curl)."
            return 1
        }
    else
        pathfind "$prog" || {
            err "$THIS:  No such web grabber '$prog' found; aborting."
            return 1
        }
    fi

    # Setup proper base options for known grabbers (each makes the program
    # write the fetched document to stdout).
    base_opts=
    case "$prog" in
    wget)  base_opts="-O-" ;;
    lynx)  base_opts="-source" ;;
    w3m)   base_opts="-dump_source" ;;
    curl)  base_opts="" ;;
    links) base_opts="-source" ;;
    w3c)   base_opts="-n -get" ;;
    *)     err "$THIS:  unhandled web grabber '$prog'; hope it succeeds."
    esac

    err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
    # $base_opts is a fixed string from the table above and is split into
    # words on purpose.  The user's options are passed as "$@" instead of
    # being re-evaluated, so arguments containing whitespace stay intact and
    # are not expanded by the shell a second time.
    # shellcheck disable=SC2086
    "$prog" $base_opts "$@" "$url"
}

# Option state filled in by the getopts loop below (empty = option absent).
encoding=   # -e ARG: character encoding of the input
grabber=    # -g ARG: command line of the program used to fetch a URL
nograb=     # -n: set to 1 when the argument must not be treated as a URL

while getopts e:g:nh opt; do
    case "$opt" in
    n)
        nograb=1
        ;;
    g)
        grabber=$OPTARG
        ;;
    e)
        encoding=$OPTARG
        ;;
    *)
        # Reached for -h and for the '?' getopts reports on a bad option.
        usage "[-e encoding] [-g grabber_command] [-n] [-h] [input_file|url]"
        exit 2
        ;;
    esac
done

# Discard the options that were parsed; only operands remain in "$@".
shift $(($OPTIND - 1))

### postopts.sh

### singlearg.sh

# Decide whether the operand names a local file or a URL.  A non-empty
# operand that is not an existing regular file is treated as a URL, unless
# -n (nograb) was given, in which case it is reported as an error.
inurl=
if [ -n "$1" ] && ! [ -f "$1" ]; then
    if [ -n "$nograb" ]; then
        err "'$1' not found; refusing to treat input as URL."
        exit 1
    fi
    # Treat given argument as an URL.
    inurl="$1"
fi

if [ -n "$inurl" ]; then
    err "Attempting to fetch file from '$inurl'..."

    ### tempdir.sh

    # The temporary paths are quoted wherever they are used so that a
    # temporary directory whose name contains whitespace or glob characters
    # still works.
    grabber_out="$THIS_TEMPDIR/grabber.out"
    grabber_log="$THIS_TEMPDIR/grabber.log"
    if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" \
                                           2>"$grabber_log"; then
        errn "grab_url_with failed"
        if [ -f "$grabber_log" ]; then
            err " with the following error log."
            err
            cat >&2 "$grabber_log"
        else
            err .
        fi
        exit 1
    fi

    # From here on the fetched copy is the input file.
    set -- "$grabber_out"
fi

# sniff_html_charset FILE...
#
# Print, in lower case, the charset named by the first <meta ... charset=...>
# declaration found in the leading lines of FILE (the part that head shows).
# Both the http-equiv="Content-Type" form and the short <meta charset="...">
# form are recognised.  Prints nothing when no declaration is found.
sniff_html_charset () {
    head "$@" |
    LC_ALL=C tr 'A-Z' 'a-z' |
    sed -ne '/<meta .*charset=/ {
        s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p
        q
    }'
}

# "$*" is used instead of "$@" so the test stays a single word no matter how
# many operands there are.
if [ -z "$encoding" ] && [ -n "$*" ]; then
    # No encoding was given with -e and the input is a file rather than
    # STDIN: try to determine the character encoding from the document.
    encoding=$(sniff_html_charset "$@")
fi

# Choose how the input is converted to UTF-8 before it is handed to tidy.
# NOTE(review): to_utf8, from_utf8, runpandoc and HAVE_ICONV are not defined
# in this file; presumably they come from the code substituted for the
# "### common.sh" marker -- confirm.
# The shell expands aliases when it reads a command, so these definitions
# must stay in a separate command that precedes the pipeline below.
if [ -n "$encoding" ] && [ -n "$HAVE_ICONV" ]; then
    # The single quotes delay the expansion of $encoding until the alias
    # is actually used.
    alias to_utf8='iconv -f "$encoding" -t utf-8'
elif [ -n "$inurl" ]; then # assume web pages are UTF-8
    alias to_utf8='cat'
fi # else just use local encoding

# Pipeline: convert the input ("$@", or stdin when there is no operand) to
# UTF-8, run it through tidy in UTF-8 mode with tidy's diagnostics on stderr
# discarded, pass the result to runpandoc reading html and writing markdown,
# and finally filter the output through from_utf8.
to_utf8 "$@" | tidy -utf8 2>/dev/null |
runpandoc -r html -w markdown -s | from_utf8
